from atexit import unregister
from collections import Counter
from sklearn.linear_model import LogisticRegression
from imblearn import over_sampling, under_sampling, combine, ensemble
import pandas as pd
import numpy as np



#--------------------------------------------------------------------------------------------------
# oversampling 
# ros = over_sampling.RandomOverSampler(sampling_strategy={0: 700,1:200,2:150 },random_state=0)
# smo = over_sampling.SMOTE(sampling_strategy={0: 700,1:200,2:150 },random_state=42)
# Blsmo = over_sampling.BorderlineSMOTE(kind='borderline-1',sampling_strategy={0: 700,1:200,2:150 },random_state=42)
# ana = over_sampling.ADASYN(sampling_strategy={0: 800,2:300,1:400 },random_state=0)
# kms = over_sampling.KMeansSMOTE(sampling_strategy={0: 800,2:300,1:400 },random_state=42)
# sm = over_sampling.SMOTENC(random_state=42, categorical_features=[18, 19])
# svmm = over_sampling.SVMSMOTE(sampling_strategy={0: 800,2:300,1:400 },random_state=42)

# # NOTE: SMOTE synthesizes samples fairly uniformly across the minority class, whereas Borderline-SMOTE concentrates synthetic samples near the class boundary. ADASYN generates more synthetic samples for minority instances that have many majority-class neighbors, so most of its generated points originate from minority samples lying close to the majority class.

# # # undersampling 
# cc = under_sampling.ClusterCentroids(sampling_strategy={0: 50,2:100,1:100 },random_state=0)
# cc = under_sampling.RandomUnderSampler(sampling_strategy={0: 50,2:100,1:100 },random_state=0)
# #nm1 = under_sampling.NearMiss(sampling_strategy={0: 50,2:100,1:100 },random_state=0, version=1)
# tl = under_sampling.TomekLinks()
# enn = under_sampling.EditedNearestNeighbours()
# renn = under_sampling.RepeatedEditedNearestNeighbours()
# allknn = under_sampling.AllKNN()
# cnn = under_sampling.CondensedNearestNeighbour(random_state=42) 
# oss = under_sampling.OneSidedSelection(random_state=42)
# ncr = under_sampling.NeighbourhoodCleaningRule()
# iht = under_sampling.InstanceHardnessThreshold(random_state=0, estimator=LogisticRegression())

# # combine oversampling and undersampling
# smote_enn = combine.SMOTEENN(random_state=0)
# smote_tomek = combine.SMOTETomek(sampling_strategy={0: 700,1:300,2:200 },random_state=0)

# ensemble
# ee = ensemble.EasyEnsembleClassifier(random_state=0, n_subsets=10)
#bc = ensemble.BalancedRandomForestClassifier(sampling_strategy={0: 500,1:199,2:89 },random_state=0, estimator=LogisticRegression(random_state=0), n_max_subset=4)

#--------------------------------------------------------------------------------------------------

# Resampling workflow: pick one of the sampler algorithms configured above.
csv_path = r'D:\test.csv'
df = pd.read_csv(csv_path, delimiter=',', low_memory=False)

# Split into feature matrix (all columns except the last) and target vector
# (last column).
X_raw = df.iloc[:, :-1]
y_raw = df.iloc[:, -1]

# Borderline-SMOTE (variant 1): synthesize minority-class samples near the
# class boundary until each class listed reaches the requested count.
sampler = over_sampling.BorderlineSMOTE(
    kind='borderline-1',
    sampling_strategy={0: 100, 1: 100, 2: 100},
    random_state=42,
)
X_res, y_res = sampler.fit_resample(X_raw, y_raw)
print(Counter(y_res))  # class distribution after resampling

# Persist the resampled dataset: column-wise concat of features and label,
# restoring the original header names.
out = pd.concat([X_res, y_res], axis=1)
out.columns = df.columns.values
out.to_csv(r'D:\test_out.csv', index=False)